

### Project: IADB Government Payroll Analytics - Country
### Project leader: Dr Christian Schuster
### Code author (s): Robert Lipiński
### Date last update: (run below)
# Shows the last-modified time of this script file. The path is taken from the
# source editor (not the "active document", which may be the console when the
# console has focus), consistent with the archive-copy step further down.
file.info(rstudioapi::getSourceEditorContext()$path)$mtime

### Script purpose: converts the original government-provided .csv payroll files into .parquet format (the optional .qs conversion is currently commented out)

### Execution time: ~2 hours

### Inputs: 
# 1) /data/raw_csv/country_codigo.csv
# 2) /data/raw_csv/country_contrata.csv
# 3) /data/raw_csv/country_honorarios.csv
# 4) /data/raw_csv/country_planta.csv


### Outputs:
# 1) /data/raw_parquet/country_codigo.parquet
# 2) /data/raw_parquet/country_contrata.parquet
# 3) /data/raw_parquet/country_honorarios.parquet
# 4) /data/raw_parquet/country_planta.parquet

# *5) /data/raw_qs/country_codigo.qs
# *6) /data/raw_qs/country_contrata.qs
# *7) /data/raw_qs/country_honorarios.qs
# *8) /data/raw_qs/country_planta.qs

#
# SET-UP --------------------------------------------------------------------------------------------
#


### Start from a clean workspace
# NOTE: this only removes objects from the global environment; attached
# packages and options set earlier in the session are left untouched.
rm(list=ls())
gc()


### Start time of the run (kept in the global environment as `t0`)
t0 = Sys.time()

### Source the '00_global.R' script with required packages and functions
# The script is looked up in the same folder as this file. The folder is taken
# from the source editor rather than the "active document", because the latter
# may be the console (with an empty path) when the console has focus.
global_script = list.files(
  path = dirname(rstudioapi::getSourceEditorContext()$path),
  pattern = "global.*\\.R$",
  full.names = TRUE
)

# source() needs exactly one file - fail with a readable message instead of a
# cryptic connection error when none or several files match the pattern
if(length(global_script) != 1){
  stop("Expected exactly one '*global*.R' script next to this file, but found ",
       length(global_script), call. = FALSE)
}

source(global_script)


# Make a copy of the file
# The copy is named '<script> - copy.R' and placed in the '00_ARCHIVE'
# sub-folder of the 'code' directory.
script_path = rstudioapi::getSourceEditorContext()$path

# ... only rewrite the trailing '.R' extension (anchored with '$'), so that a
# '.R' occurring elsewhere in the path is left alone
archive_path = sub('\\.R$', ' - copy.R', script_path)

# ... only rewrite the (last) directory component named exactly 'code'; a plain
# replacement of every 'code' substring would also mangle names such as
# 'codigo' or 'vscode' appearing in the path
archive_path = sub('^(.*[/\\\\])code[/\\\\]', '\\1code/00_ARCHIVE/', archive_path, perl = TRUE)

# NOTE(review): file.copy() returns FALSE (without an error) if the
# '00_ARCHIVE' folder does not exist - confirm it is created elsewhere
file.copy(script_path, archive_path, overwrite = TRUE, copy.date = TRUE)



### >> NOTES   -----------------------------------------------------------------------------------------------------------------------------------------------



# ' ------------------------------------------------------------------------------------------------------------------------------
# CONVERT .CSV   -----------------------------------------------------------------------------------------------------------------------------------------------
# 

# NOTE: All files need to be read, but the original .csv files are large,
# so they are slow to read. So on the first run, read each .csv and convert it to .parquet.
# Next, row-bind all files together and save the result for cleaning (so that the merging
# doesn't have to be repeated every time)

### CHOOSE: if run ANEW --------------------------------------------------------------------------------------------
# anew = TRUE  -> always re-read the .csv files and (re-)write the .parquet files
# anew = FALSE -> only convert the file types for which no .parquet file exists yet
anew = TRUE

# i = 'planta' # (debugging aid: un-comment to step through the loop body for a single file type)

### Loop ---------------------------------------------------------------------------------------------------------------
# For every file type....
for(i in c('planta', 'contrata', 'honorarios', 'codigo')){

    # ... control loop
    print(i)

    # ... build the input/output paths once per file type
    # (the output path has no extension: that is how it is passed to write_flex(),
    # which is defined outside this script - presumably it appends the extension
    # based on `format`; confirm)
    csv_path     = file.path(main_dir, 'data', 'raw_csv', paste0('country_', i, '.csv'))
    parquet_stem = file.path(main_dir, 'data', 'raw_parquet', paste0('country_', i))

    # ... check if .parquet exists (saving as .parquet is more universal, allows column selection when
    # re-reading and is often also faster than the main alternative - .qs)
    # `||` is the scalar operator meant for if() conditions
    if(anew || !file.exists(paste0(parquet_stem, '.parquet'))){

        # ... if it does not exist (or anew = TRUE), read .csv and re-save as .parquet
        # fread() appears the quickest way to read .csv of this size
        print('reading .csv...')
        temp = fread(csv_path, encoding = 'Latin-1') %>% clean_names()
        gc()

        print('writing .parquet....')

        write_flex(x = temp,
                   file = parquet_stem,
                   format = 'parquet')

        # ... drop the table before the next file is read, so that two large
        # tables are never held in memory at the same time
        rm(temp)
        gc()
    }


    # ... check if .qs exists (currently disabled)
    # if(anew | !file.exists(file.path(main_dir, 'data', 'raw_qs', paste0('country_', i, '.qs')))){
    #
    #     if(file.exists(file.path(main_dir, 'data', 'raw_parquet', paste0('country_', i, '.parquet')))){
    #       print('reading .parquet...')
    #
    #       temp = read_flex(file = file.path(main_dir, 'data', 'raw_parquet', paste0('country_', i)),
    #                        format = 'parquet')
    #     }else{
    #       temp  = fread(file.path(main_dir, 'data', 'raw_csv', paste0('country_', i, '.csv')),
    #                     encoding = 'Latin-1') %>% clean_names()
    #       gc()
    #     }
    #
    #     gc()
    #     print('writing .qs...')
    #
    #     write_flex(x = temp,
    #                file = file.path(main_dir, 'data', 'raw_qs', paste0('country_', i)),
    #                format = 'qs')
    #   }

}


### add execution time
# exec_time_fun() is not defined in this script (presumably it comes from the
# sourced global script and uses the start time `t0` set at the top - confirm)
exec_time_fun('exec_time')


# ' ------------------------------------------------------------------------------------------------------------------------------------------------------------
# FIN DEL CÓDIGO  --------------------------------------------------------------------------------------------------------------------------------------
# 